#Importing the required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

#Loading the dataset
df = pd.read_csv('usedCars.csv')
df.head()

#Shape of the dataset
df.shape

(1064, 19)

#columns in the dataset
df.columns

Index(['Id', 'Company', 'Model', 'Variant', 'FuelType', 'Colour', 'Kilometer',
       'BodyStyle', 'TransmissionType', 'ManufactureDate', 'ModelYear',
       'CngKit', 'Price', 'Owner', 'DealerState', 'DealerName', 'City',
       'Warranty', 'QualityScore'],
      dtype='object')

#dropping column Id, as it is an identifier and not required for analysis
df.drop('Id',axis=1,inplace=True)

#Column Data Types
df.dtypes

Company              object
Model                object
Variant              object
FuelType             object
Colour               object
Kilometer             int64
BodyStyle            object
TransmissionType     object
ManufactureDate      object
ModelYear             int64
CngKit               object
Price                object
Owner                object
DealerState          object
DealerName           object
City                 object
Warranty              int64
QualityScore        float64
dtype: object

def convert_amount(amount_str):
    """Convert a price label to a plain numeric amount.

    Accepts labels such as '5.75 Lakhs', '5.75Lakhs', '3 lakh',
    '1.5 Crore' or '95,000'. Thousands separators and surrounding
    whitespace are ignored and the unit is matched case-insensitively.

    Parameters:
        amount_str: the raw price cell. Normally a string; None/NaN
            (how pandas represents a missing cell) and plain numbers
            are also accepted.

    Returns:
        The amount as a float. Missing input yields NaN.

    Raises:
        ValueError: if the numeric part of the label cannot be parsed.
    """
    # Missing cells reach .apply() as None or float NaN; propagate them as NaN
    # instead of failing on the string operations below.
    if amount_str is None:
        return float('nan')
    if isinstance(amount_str, (int, float)):
        return float(amount_str)

    text = str(amount_str).replace(',', '').strip()
    lowered = text.lower()

    # Indian numbering units: 1 Lakh = 100,000 and 1 Crore = 10,000,000.
    units = (
        ('crores', 10000000),
        ('crore', 10000000),
        ('lakhs', 100000),
        ('lakh', 100000),
    )
    for suffix, multiplier in units:
        if lowered.endswith(suffix):
            return float(text[:-len(suffix)]) * multiplier

    return float(text)

df['Price'] = df['Price'].apply(convert_amount)

#Checking for null values percentage wise
df.isnull().sum()/df.shape[0]*100

Company              0.000000
Model                0.000000
Variant              0.000000
FuelType             0.093985
Colour               0.000000
Kilometer            0.000000
BodyStyle            0.000000
TransmissionType    67.105263
ManufactureDate      0.000000
ModelYear            0.000000
CngKit              97.932331
Price                0.000000
Owner                0.000000
DealerState          0.000000
DealerName           0.000000
City                 0.000000
Warranty             0.000000
QualityScore         0.000000
dtype: float64

df.drop('CngKit', axis=1, inplace=True)

#Dropping TransmissionType column
df.drop('TransmissionType',axis=1,inplace=True)

#Removing rows with null values in the FuelType column
#dropna must be called on the DataFrame: calling it on df['FuelType'] only
#affects that extracted Series and leaves the rows of df in place
df.dropna(subset=['FuelType'], inplace=True)

df.drop('ManufactureDate', axis = 1, inplace=True)

df.drop('Variant', axis = 1, inplace=True)

df['ModelYear'] = 2023 - df['ModelYear']
df.rename(columns={'ModelYear':'Age'},inplace=True)

for i in df.columns:
    print(i,df[i].nunique())

Company 23
Model 218
FuelType 5
Colour 76
Kilometer 1006
BodyStyle 10
Age 17
Price 362
Owner 4
DealerState 10
DealerName 57
City 11
Warranty 2
QualityScore 43

df.describe()

df.head()

#Number of cars by company
#The column is passed by keyword: recent seaborn releases accept only `data` positionally,
#so a bare Series would not be used as the x variable
sns.countplot(x = 'Company', data = df, order = df['Company'].value_counts().index, hue = 'Company', palette = 'Set1').set_title('Number of cars by company')

Text(0.5, 1.0, 'Number of cars by company')

#Top 10 cars models by number
#The column is passed by keyword: recent seaborn releases accept only `data` positionally,
#so a bare Series would not be used as the x variable
sns.countplot(x = 'Model', data = df, order = df['Model'].value_counts().iloc[:10].index, hue = 'Model', palette = 'Set1').set_title('Top 10 Car Models')

Text(0.5, 1.0, 'Top 10 Car Models')

#Cars count by fuel type
sns.countplot(x = 'FuelType', data = df, palette = 'Set1').set_title('Number of cars by Fuel Type')

Text(0.5, 1.0, 'Number of cars by Fuel Type')

#Top 10 colors of cars
sns.countplot(x = 'Colour', data = df, order = df['Colour'].value_counts().iloc[:10].index).set_title('Top 10 Car Colours')
plt.xticks(rotation = 90)

([0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
 [Text(0, 0, 'White'),
  Text(1, 0, 'Silver'),
  Text(2, 0, 'Grey'),
  Text(3, 0, 'Red'),
  Text(4, 0, 'Black'),
  Text(5, 0, 'Brown'),
  Text(6, 0, 'Blue'),
  Text(7, 0, 'A Blue'),
  Text(8, 0, 'Pearl White'),
  Text(9, 0, 'Orange')])

#Odometer reading distribution
sns.histplot(x = 'Kilometer', data = df, bins = 20).set_title('Odometer Reading')

Text(0.5, 1.0, 'Odometer Reading')

#Body style count
sns.countplot(x = 'BodyStyle', data = df).set_title('Number of cars by Body Style')
plt.xticks(rotation = 90)

([0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
 [Text(0, 0, 'HATCHBACK'),
  Text(1, 0, 'SEDAN'),
  Text(2, 0, 'MPV'),
  Text(3, 0, 'SUV'),
  Text(4, 0, 'HATCHBACK '),
  Text(5, 0, 'VAN'),
  Text(6, 0, 'MUV'),
  Text(7, 0, 'COMPACTSUV'),
  Text(8, 0, 'Sedan'),
  Text(9, 0, 'SUV ')])

#Car age distribution
sns.histplot(x = 'Age', data = df, bins = 20).set_title('Car age distribution')

Text(0.5, 1.0, 'Car age distribution')

#Price distribution
sns.histplot(x = 'Price', data =df, bins = 30).set_title('Car price distribution')

Text(0.5, 1.0, 'Car price distribution')

fig, ax = plt.subplots(1,3,figsize=(20,7))

#Dealer State
sns.countplot(x = 'DealerState', data = df, ax = ax[0]).set_title('Dealer States')
ax[0].set_xticklabels(ax[0].get_xticklabels(), rotation = 90)

#City
sns.countplot(x = 'City', data = df, ax = ax[1]).set_title('City')
ax[1].set_xticklabels(ax[1].get_xticklabels(), rotation = 90)

#top 10 dealers
sns.countplot(x = 'DealerName', data = df, order = df['DealerName'].value_counts().iloc[:10].index, ax = ax[2]).set_title('Top 10 Dealers')
ax[2].set_xticklabels(ax[2].get_xticklabels(), rotation = 90)

[Text(0, 0, 'Car Choice Exclusif'),
 Text(1, 0, 'Car&Bike Superstore Pune'),
 Text(2, 0, 'Prestige Autoworld Pvt Ltd'),
 Text(3, 0, 'Star Auto India'),
 Text(4, 0, 'Noida Car Ghar'),
 Text(5, 0, 'Top Gear Cars'),
 Text(6, 0, 'Car Estate'),
 Text(7, 0, 'OM Motors'),
 Text(8, 0, 'Jeen Mata Motors'),
 Text(9, 0, 'Royal Motors (Prop. Auto Carriage Pvt Ltd)')]

sns.countplot(x = 'Owner', data = df).set_title('Number of cars by Owner Type')

Text(0.5, 1.0, 'Number of cars by Owner Type')

sns.countplot(x = 'Warranty', data = df).set_title('Number of cars by Warranty')

Text(0.5, 1.0, 'Number of cars by Warranty')

sns.histplot(x = 'QualityScore', data = df, bins = 10).set_title('Quality Score Distribution')

Text(0.5, 1.0, 'Quality Score Distribution')

#Top 10 car companies by price
sns.barplot(y = 'Company', x = 'Price', data = df, order = df.groupby('Company')['Price'].mean().sort_values(ascending=False).iloc[:10].index, hue = 'Company', palette= 'Set1').set_title('Top 10 car Companies by price')

Text(0.5, 1.0, 'Top 10 car Companies by price')

#Top 10 car models by price
sns.barplot(y = 'Model', x = 'Price', data = df, order = df.groupby('Model')['Price'].mean().sort_values(ascending=False).iloc[:10].index, hue = 'Model', palette= 'Set1').set_title('Top 10 car Models by price')

Text(0.5, 1.0, 'Top 10 car Models by price')

fig, ax = plt.subplots(1,2,figsize=(15,5))
sns.boxplot(x = 'FuelType', y = 'Price', data = df, ax = ax[0], hue = 'FuelType').set_title('Price by Fuel Type')
sns.violinplot(x = 'FuelType', y = 'Price', data = df, ax = ax[1], hue = 'FuelType').set_title('Price by Fuel Type')

Text(0.5, 1.0, 'Price by Fuel Type')

#Top 10 car colors by price
sns.barplot(y = 'Colour', x = 'Price', data = df, order = df.groupby('Colour')['Price'].mean().sort_values(ascending=False).iloc[:10].index).set_title('Top 10 car Colors by price')

Text(0.5, 1.0, 'Top 10 car Colors by price')

sns.scatterplot(x = 'Kilometer', y = 'Price', data = df).set_title('Odometer Reading and Price')

Text(0.5, 1.0, 'Odometer Reading and Price')

sns.barplot(x = 'BodyStyle', y = 'Price', data = df, hue = 'BodyStyle').set_title('Price by Body Style')
plt.xticks(rotation = 90)

([0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
 [Text(0, 0, 'HATCHBACK'),
  Text(1, 0, 'SEDAN'),
  Text(2, 0, 'MPV'),
  Text(3, 0, 'SUV'),
  Text(4, 0, 'HATCHBACK '),
  Text(5, 0, 'VAN'),
  Text(6, 0, 'MUV'),
  Text(7, 0, 'COMPACTSUV'),
  Text(8, 0, 'Sedan'),
  Text(9, 0, 'SUV ')])

sns.barplot(x = 'Age', y = 'Price', data = df).set_title('Car age and Price')

Text(0.5, 1.0, 'Car age and Price')

fig, ax = plt.subplots(1,3,figsize=(20,7))

#Dealer State
sns.violinplot(x = 'DealerState', y = 'Price', data = df, ax = ax[0], hue = 'DealerState').set_title('Dealer States')
ax[0].set_xticklabels(ax[0].get_xticklabels(), rotation = 90)

#City
sns.violinplot(x = 'City',y = 'Price', data = df, ax = ax[1], hue = 'City').set_title('City')
ax[1].set_xticklabels(ax[1].get_xticklabels(), rotation = 90)

#top 10 dealers
sns.violinplot(x = 'DealerName',y = 'Price', data = df, order = df['DealerName'].value_counts().iloc[:10].index, ax = ax[2], hue = 'DealerName').set_title('Top 10 Dealers')
ax[2].set_xticklabels(ax[2].get_xticklabels(), rotation = 90)

[Text(0, 0, 'Car Choice Exclusif'),
 Text(1, 0, 'Car&Bike Superstore Pune'),
 Text(2, 0, 'Prestige Autoworld Pvt Ltd'),
 Text(3, 0, 'Star Auto India'),
 Text(4, 0, 'Noida Car Ghar'),
 Text(5, 0, 'Top Gear Cars'),
 Text(6, 0, 'Car Estate'),
 Text(7, 0, 'OM Motors'),
 Text(8, 0, 'Jeen Mata Motors'),
 Text(9, 0, 'Royal Motors (Prop. Auto Carriage Pvt Ltd)')]

sns.violinplot(x = 'Owner', y = 'Price', data = df, hue = 'Owner').set_title('Price by Owner Type')

Text(0.5, 1.0, 'Price by Owner Type')

sns.violinplot(x = 'Warranty', y = 'Price', data = df, hue = 'Warranty').set_title('Price by Warranty')

Text(0.5, 1.0, 'Price by Warranty')

sns.scatterplot(x = 'QualityScore', y = 'Price', data = df).set_title('Quality Score and Price')

Text(0.5, 1.0, 'Quality Score and Price')

df.drop('Model', axis = 1, inplace = True)

#columns for label encoding
cols = df.select_dtypes(include=['object']).columns

from sklearn.preprocessing import LabelEncoder
#Label encoding object
le = LabelEncoder()

#label encoding for object type columns
for i in cols:
    #Normalise stray whitespace and letter case first, so that spellings such as
    #'HATCHBACK ' / 'HATCHBACK' or 'Sedan' / 'SEDAN' share a single code
    df[i] = df[i].str.strip().str.upper()
    df[i] = le.fit_transform(df[i])
    print(i, df[i].unique())

Company [12  7 19  5 13 21 11  6 17 16  9  4 20 10  1  3 18 14  0  8 22 15  2]
FuelType [4 1 0 2 5 3]
Colour [61 56 34  0  9 11 66 47 49 38 14 71 72 30 74 52 39 28 60  7 54 62 40 13
 20 70 63 12 24 23 35 26 29 15 31  1 68  4  8 73 22 44 57 65 42 50 32 64
 19 43 46 33 16 27 53 25 10 69 51 17  6 48 59 58  5  3 18 45 67 36 21 55
  2 37 75 41]
BodyStyle [1 5 3 6 2 9 4 0 8 7]
Owner [0 1 2 3]
DealerState [2 4 0 1 8 7 3 6 9 5]
DealerName [52 38  4  1 56 29  0 34 47 51 11 21  9 10 43 33  7 16  5 12 42 17 27 50
 45  6 20 36 23 41 32 31 18  2 48 15 54 40 55 13 49 25 35 46 24 14 44 19
 39 28 26  3 53 30  8 22 37]
City [ 0 10  2  3  9  4  5  8  1  7  6]

#Using IQR to remove outliers

#columns for outlier removal: only the continuous measurements.
#Selecting by integer/float dtype would also pick up the label-encoded categorical
#columns, whose codes are arbitrary ranks, so quartiles of them carry no meaning.
cols = ['Kilometer', 'Age', 'Price', 'QualityScore']
Q1 = df[cols].quantile(0.25)
Q3 = df[cols].quantile(0.75)

IQR = Q3 - Q1

#Removing outliers: keep rows where every selected column lies inside
#[Q1 - 1.5 * IQR, Q3 + 1.5 * IQR]
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
df = df[~((df[cols] < lower_bound) | (df[cols] > upper_bound)).any(axis=1)]

plt.figure(figsize=(15,10))
sns.heatmap(df.corr(), annot=True)

<Axes: >

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df.drop('Price',axis=1), df['Price'], test_size=0.2, random_state=42)

from sklearn.tree import DecisionTreeRegressor
#Decision Tree Regressor Object
dtr = DecisionTreeRegressor()

from sklearn.model_selection import GridSearchCV

#parameters for grid search
para = {
    'max_depth' : [2,4,6,8],
    'min_samples_leaf' : [2,4,6,8],
    'min_samples_split' : [2,4,6,8],
    'random_state' : [0,42]
}

#Grid Search Object
grid = GridSearchCV(estimator=dtr, param_grid=para, cv=5, n_jobs=-1, verbose=2)

#Fitting the model
grid.fit(X_train, y_train)

#Best parameters
print(grid.best_params_)

Fitting 5 folds for each of 128 candidates, totalling 640 fits
{'max_depth': 6, 'min_samples_leaf': 2, 'min_samples_split': 2, 'random_state': 42}

#decision tree regressor with best parameters
#Built from the grid search result rather than values copied by hand from its
#printed output, so the model stays in sync if the data or the grid changes
dtr = DecisionTreeRegressor(**grid.best_params_)

#Fitting the model
dtr.fit(X_train, y_train)

#Training score
print(dtr.score(X_train, y_train))

0.7445153281346839

#Prediction
dtr_pred = dtr.predict(X_test)

from sklearn.ensemble import RandomForestRegressor
#Random Forest Regressor Object
rfr = RandomForestRegressor()

from sklearn.model_selection import GridSearchCV

#parameters for grid search
para = {
    'max_depth' : [2,4,6,8],
    'min_samples_leaf' : [2,4,6,8],
    'min_samples_split' : [2,4,6,8],
    'random_state' : [0,42]
}

#Grid Search Object
grid = GridSearchCV(estimator=rfr, param_grid=para, cv=5, n_jobs=-1, verbose=2)

#Fitting the model
grid.fit(X_train, y_train)

#Best parameters
print(grid.best_params_)

Fitting 5 folds for each of 128 candidates, totalling 640 fits
{'max_depth': 8, 'min_samples_leaf': 2, 'min_samples_split': 2, 'random_state': 0}

#Random Forest Regressor with best parameters
#Built from the grid search result rather than values copied by hand from its
#printed output, so the model stays in sync if the data or the grid changes
rfr = RandomForestRegressor(**grid.best_params_)

#Fitting the model
rfr.fit(X_train, y_train)

#Training score
print(rfr.score(X_train, y_train))

0.8781873430425237

#Prediction
rfr_pred = rfr.predict(X_test)

fig,ax = plt.subplots(1,2,figsize=(10,5))

#decision tree regressor
#kdeplot replaces the deprecated distplot(hist = False); legend() is needed for
#the 'Actual' / 'Predicted' labels to be drawn on the axes
sns.kdeplot(x = y_test, ax = ax[0], color = 'r', label = 'Actual').set_title('Decision Tree Regressor')
sns.kdeplot(x = dtr_pred, ax = ax[0], color = 'b', label = 'Predicted')
ax[0].legend()

#random forest regressor
sns.kdeplot(x = y_test, ax = ax[1], color = 'r', label = 'Actual').set_title('Random Forest Regressor')
sns.kdeplot(x = rfr_pred, ax = ax[1], color = 'b', label = 'Predicted')
ax[1].legend()

<Axes: title={'center': 'Random Forest Regressor'}, ylabel='Density'>

from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

#Decision Tree Regressor
print('Decision Tree Regressor')
print('Mean Squared Error : ', mean_squared_error(y_test, dtr_pred))
print('Mean Absolute Error : ', mean_absolute_error(y_test, dtr_pred))
print('R2 Score : ', r2_score(y_test, dtr_pred))

#Random Forest Regressor
print('Random Forest Regressor')
print('Mean Squared Error : ', mean_squared_error(y_test, rfr_pred))
print('Mean Absolute Error : ', mean_absolute_error(y_test, rfr_pred))
print('R2 Score : ', r2_score(y_test, rfr_pred))

Decision Tree Regressor
Mean Squared Error :  46746127636.183586
Mean Absolute Error :  161645.14749542723
R2 Score :  0.5660724036960223
Random Forest Regressor
Mean Squared Error :  31811887039.002945
Mean Absolute Error :  134717.2267038187
R2 Score :  0.704701621829243

fig, ax = plt.subplots(1,2,figsize=(15, 5))
fig.subplots_adjust(wspace=0.5)

#Decision Tree Regressor
feature_df = pd.DataFrame({'Features':X_train.columns, 'Importance':dtr.feature_importances_})
feature_df.sort_values(by='Importance', ascending=False, inplace=True)
sns.barplot(x = 'Importance', y = 'Features', data = feature_df, ax = ax[0]).set_title('Decision Tree Regressor')

#Random Forest Regressor
feature_df = pd.DataFrame({'Features':X_train.columns, 'Importance':rfr.feature_importances_})
feature_df.sort_values(by='Importance', ascending=False, inplace=True)
sns.barplot(x = 'Importance', y = 'Features', data = feature_df, ax = ax[1]).set_title('Random Forest Regressor')

Text(0.5, 1.0, 'Random Forest Regressor')

Column Name	Description
ID	Unique ID for each listing
Company	Name of the car manufacturer
Model	Name of the car model
Variant	Name of the car variant
Fuel Type	Fuel type of the car
Color	Color of the car
Kilometer	Number of kilometers driven by the car
Body Style	Body style of the car
Transmission Type	Transmission type of the car
Manufacture Date	Manufacture date of the car
Model Year	Model year of the car
CngKit	Whether the car has a CNG kit or not
Price	Price of the car
Owner Type	Number of previous owners of the car
Dealer State	State in which the car is being sold
Dealer Name	Name of the dealer selling the car
City	City in which the car is being sold
Warranty	Warranty offered by the dealer
Quality Score	Quality score of the car

	Id	Company	Model	Variant	FuelType	Colour	Kilometer	BodyStyle	TransmissionType	ManufactureDate	ModelYear	CngKit	Price	Owner	DealerState	DealerName	City	Warranty	QualityScore
0	555675	MARUTI SUZUKI	CELERIO(2017-2019)	1.0 ZXI AMT O	PETROL	Silver	33197	HATCHBACK	NaN	2018-02-01	2018	NaN	5.75 Lakhs	1st Owner	Karnataka	Top Gear Cars	Bangalore	1	7.8
1	556383	MARUTI SUZUKI	ALTO	LXI	PETROL	Red	10322	HATCHBACK	Manual	2021-03-01	2021	NaN	4.35 Lakhs	1st Owner	Karnataka	Renew 4 u Automobiles PVT Ltd	Bangalore	1	8.3
2	556422	HYUNDAI	GRAND I10	1.2 KAPPA ASTA	PETROL	Grey	37889	HATCHBACK	Manual	2015-03-01	2015	NaN	4.7 Lakhs	1st Owner	Karnataka	Anant Cars Auto Pvt Ltd	Bangalore	1	7.9
3	556771	TATA	NEXON	XT PLUS	PETROL	A Blue	13106	HATCHBACK	NaN	2020-08-01	2020	NaN	9.9 Lakhs	1st Owner	Karnataka	Adeep Motors	Bangalore	1	8.1
4	559619	FORD	FIGO	EXI DURATORQ 1.4	DIESEL	Silver	104614	HATCHBACK	Manual	2010-11-01	2010	NaN	2.7 Lakhs	2nd Owner	Karnataka	Zippy Automart	Bangalore	0	7.5

	Kilometer	Age	Price	Warranty	QualityScore
count	1064.000000	1064.000000	1.064000e+03	1064.000000	1064.000000
mean	52807.187970	6.135338	8.350536e+05	0.738722	7.770207
std	33840.296979	2.996786	5.726538e+05	0.439538	0.719717
min	101.000000	0.000000	9.500000e+04	0.000000	0.000000
25%	32113.500000	4.000000	4.850000e+05	0.000000	7.500000
50%	49432.000000	6.000000	6.750000e+05	1.000000	7.800000
75%	68828.500000	8.000000	9.850000e+05	1.000000	8.100000
max	640000.000000	20.000000	8.500000e+06	1.000000	9.400000

	Company	Model	FuelType	Colour	Kilometer	BodyStyle	Age	Price	Owner	DealerState	DealerName	City	Warranty	QualityScore
0	MARUTI SUZUKI	CELERIO(2017-2019)	PETROL	Silver	33197	HATCHBACK	5	575000.0	1st Owner	Karnataka	Top Gear Cars	Bangalore	1	7.8
1	MARUTI SUZUKI	ALTO	PETROL	Red	10322	HATCHBACK	2	435000.0	1st Owner	Karnataka	Renew 4 u Automobiles PVT Ltd	Bangalore	1	8.3
2	HYUNDAI	GRAND I10	PETROL	Grey	37889	HATCHBACK	8	470000.0	1st Owner	Karnataka	Anant Cars Auto Pvt Ltd	Bangalore	1	7.9
3	TATA	NEXON	PETROL	A Blue	13106	HATCHBACK	3	990000.0	1st Owner	Karnataka	Adeep Motors	Bangalore	1	8.1
4	FORD	FIGO	DIESEL	Silver	104614	HATCHBACK	13	270000.0	2nd Owner	Karnataka	Zippy Automart	Bangalore	0	7.5

Indian Used Car Price Prediction¶

About the Dataset¶

Data Dictionary¶

Data Preprocessing Part 1¶

Exploratory Data Analysis¶

Car Company¶

Top 10 Car Models¶

Car Fuel Type¶

Top 10 Colors for Cars¶

Odometer Reading¶

Body Style¶

Car Age Distribution¶

Price Distribution¶

Location based Distribution¶

Car Owner Type¶

Warranty¶

Quality Score Distribution¶

So far, I have visualized the distribution of each variable to get a better understanding of the data. Now, I will look at the relationship between the car price and the independent variables.¶

Top 10 Car Companies by Price¶

Top 10 Car Models by Price¶

Car Fuel Type and Price¶

Top 10 Car Colors by Price¶

Odometer Reading and Price¶

Body Style and Price¶

Car Age and Price¶

Location based Price Distribution¶

Car Owner Type and Price¶

Warranty and Price¶

Quality Score and Price¶

Data Preprocessing Part 2¶

Label Encoding¶

Outlier Removal¶

Correlation Matrix Heatmap¶

Train Test Split¶

Model Building¶

Decision Tree Regressor¶

Hyperparameter Tuning¶

Random Forest Regressor¶

Hyperparameter Tuning¶

Model Evaluation¶

Distribution Plot¶

Model Metrics¶

Feature Importance¶

Conclusion¶